########>> INITIALIZE <<########

# === Basic Operation Libraries ===
import os
import sys
import ast
import datetime
import re
import time

# === Data Analysis Libraries ===
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
%matplotlib inline

# === Machine Learning Libraries ===
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text, _tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# === Display Settings for Jupyter ===
from IPython.display import display, HTML

# === Display Settings for Pandas ===
# Same four display options, all set through the attribute-style options API.
pd.options.display.html.table_schema = True
pd.options.display.expand_frame_repr = True
pd.options.display.max_colwidth = 200
pd.options.display.html.use_mathjax = False

# === Manage Warnings ===
import warnings
# Blanket suppression: every warning category is ignored from here on.
warnings.filterwarnings(action='ignore')

# === Completion Timestamp ===
# Prints a blank line, then "Finished : <current local timestamp>".
print(f"\n{'Finished':<5} : {datetime.datetime.now()}")

Finished : 2025-11-25 16:06:03.892187

# Load the apartment listings (absolute, machine-specific path).
apt = pd.read_csv("C:\\Users\\alexp\\Charlotte_Apartments.csv")

apt.head()

# Drop non-predictive or text-heavy columns
apt_cleaned = apt.drop(columns=['Address', 'Unit_Variant', 'Amenities', 'Website'])

# Encode categorical columns (every category keeps its own indicator column)
apt_encoded = pd.get_dummies(apt_cleaned, columns=['Complex', 'Neighborhood'], drop_first=False)

# Define target and features for PPSF (overall)
# NOTE(review): 'Rent' stays in the feature matrix while the target is
# price_per_sqft. If price_per_sqft is derived from Rent and Sqft this is
# target leakage — confirm, and consider dropping 'Rent' from the features.
y_ppsf = apt_encoded['price_per_sqft']
X_ppsf = apt_encoded.drop(columns=['price_per_sqft'])

# Train/test split (distinct variable names for PPSF overall)
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_ppsf, y_ppsf, test_size=0.2, random_state=42
)

# Fit decision tree for PPSF overall
tree_ppsf = DecisionTreeRegressor(max_depth=3, min_samples_leaf=5, random_state=42)
tree_ppsf.fit(X_train_p, y_train_p)

# Visualize the tree
plt.figure(figsize=(36, 20))
plot_tree(
    tree_ppsf,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here,
    # and this matches how export_text is called elsewhere in this notebook.
    feature_names=list(X_ppsf.columns),
    filled=True,
    rounded=True,
    fontsize=14
)
plt.title("PPSF Decision Tree — Overall", fontsize=18)
plt.show()

# Feature importance
importances_ppsf = pd.Series(tree_ppsf.feature_importances_, index=X_ppsf.columns)
print(importances_ppsf.sort_values(ascending=False))

# R² score for PPSF overall tree (evaluated on the held-out 20% split)
r2_tree_ppsf = tree_ppsf.score(X_test_p, y_test_p)
print(f"R² for PPSF Decision Tree — Overall: {r2_tree_ppsf:.3f}")

Complex_The Landon              0.560687
Complex_Bond on Mint            0.254396
Sqft                            0.183402
Bedrooms                        0.001516
Rent                            0.000000
Bathrooms                       0.000000
gym                             0.000000
laundry                         0.000000
parking                         0.000000
ev_charging                     0.000000
elevator                        0.000000
pool                            0.000000
secure_access                   0.000000
wifi                            0.000000
trash_pickup                    0.000000
wifi_common                     0.000000
renters_insurance               0.000000
packages                        0.000000
recycling                       0.000000
pets                            0.000000
Complex_Broadstone Craft        0.000000
Complex_Ello House              0.000000
Complex_Moderna Liberty Row     0.000000
Complex_Hawkins Press           0.000000
Complex_Novel Mallard Creek     0.000000
Complex_Solis Midtown           0.000000
Complex_The Henry               0.000000
Complex_The Leo LoSo            0.000000
Complex_The Perch               0.000000
Complex_Tyvola Tapestry         0.000000
Neighborhood_NoDa               0.000000
Neighborhood_South End          0.000000
Neighborhood_SouthPark          0.000000
Neighborhood_University City    0.000000
Neighborhood_Uptown             0.000000
Neighborhood_West Charlotte     0.000000
dtype: float64
R² for PPSF Decision Tree — Overall: 0.606

def profile_tree_segments(tree, X, y, original_df, target_col, unit_col="Sqft", complex_col="Complex", neighborhood_col="Neighborhood"):
    """
    Build a mini segment profile for each leaf node of a fitted decision tree.

    Parameters:
    - tree: fitted estimator exposing ``apply(X)``, which returns one leaf id per row of X
    - X: feature DataFrame; its index labels are used to look rows up in original_df
    - y: target values (accepted for call compatibility; not read by this function)
    - original_df: DataFrame containing target_col, unit_col, complex_col and
      neighborhood_col, indexed with the same labels as X
    - target_col: column whose per-leaf mean is reported as "Avg_<target_col>"
    - unit_col: column name for unit size (default "Sqft")
    - complex_col: column name for complex (default "Complex")
    - neighborhood_col: column name for neighborhood (default "Neighborhood")

    Returns:
    - DataFrame with one row per leaf reached by X: Leaf_ID, Avg_<target_col>
      (rounded to 2 decimals), Typical_Size (mean unit size rounded to 0 decimals),
      Dominant_Complex, Dominant_Neighborhood and Count. A dominant value is None
      when every value of that column in the leaf is missing.
    """

    def _most_common(values):
        # Series.mode() drops missing values, so an all-missing column gives an
        # empty result; indexing it with [0] would raise instead of yielding None.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    # Get leaf node assignment for each row in X
    leaf_ids = np.asarray(tree.apply(X))

    profiles = []
    for leaf in np.unique(leaf_ids):
        # Use the same index as X to select this leaf's rows from original_df
        segment_df = original_df.loc[X.index[leaf_ids == leaf]]

        profiles.append({
            "Leaf_ID": leaf,
            "Avg_" + target_col: round(segment_df[target_col].mean(), 2),
            "Typical_Size": round(segment_df[unit_col].mean(), 0),
            "Dominant_Complex": _most_common(segment_df[complex_col]),
            "Dominant_Neighborhood": _most_common(segment_df[neighborhood_col]),
            "Count": len(segment_df)
        })

    return pd.DataFrame(profiles)

# Profile every leaf of the overall PPSF tree using its training rows.
segment_profiles_ppsf = profile_tree_segments(
    tree=tree_ppsf,
    X=X_train_p,              # features used to fit
    y=y_train_p,              # target
    original_df=apt_cleaned,  # un-encoded frame with Sqft, Complex, Neighborhood
    target_col="price_per_sqft",
)

print(segment_profiles_ppsf)

   Leaf_ID  Avg_price_per_sqft  Typical_Size Dominant_Complex  \
0        3                2.43         721.0       Ello House   
1        4                2.02        1205.0     The Leo LoSo   
2        6                3.23         631.0     Bond on Mint   
3        7                2.88        1134.0     Bond on Mint   
4       10                1.58         769.0       The Landon   
5       11                1.38         980.0       The Landon   
6       13                1.12        1215.0       The Landon   
7       14                1.26        1338.0       The Landon   

  Dominant_Neighborhood  Count  
0             South End    103  
1             South End     54  
2                Uptown     10  
3                Uptown      7  
4             SouthPark      5  
5             SouthPark      6  
6             SouthPark      6  
7             SouthPark      5

# Plain-text rendering of the overall PPSF tree's split rules.
ppsf_feature_labels = X_ppsf.columns.tolist()
tree_rules = export_text(tree_ppsf, feature_names=ppsf_feature_labels)
print(tree_rules)

|--- Complex_The Landon <= 0.50
|   |--- Complex_Bond on Mint <= 0.50
|   |   |--- Sqft <= 1001.00
|   |   |   |--- value: [2.43]
|   |   |--- Sqft >  1001.00
|   |   |   |--- value: [2.02]
|   |--- Complex_Bond on Mint >  0.50
|   |   |--- Sqft <= 751.50
|   |   |   |--- value: [3.23]
|   |   |--- Sqft >  751.50
|   |   |   |--- value: [2.88]
|--- Complex_The Landon >  0.50
|   |--- Sqft <= 1071.50
|   |   |--- Sqft <= 877.00
|   |   |   |--- value: [1.58]
|   |   |--- Sqft >  877.00
|   |   |   |--- value: [1.38]
|   |--- Sqft >  1071.50
|   |   |--- Bedrooms <= 2.50
|   |   |   |--- value: [1.12]
|   |   |--- Bedrooms >  2.50
|   |   |   |--- value: [1.26]

def extract_leaf_rules(tree, feature_names):
    """Collect one human-readable rule per leaf of a fitted decision tree.

    Walks ``tree.tree_`` from the root. Each split contributes
    ``"<feature> <= <threshold>"`` on its left branch and
    ``"<feature> > <threshold>"`` on its right branch, with the threshold
    formatted to one decimal place.

    Returns a list of ``(rule, value)`` tuples in left-to-right leaf order:
    ``rule`` joins the conditions on the path with " AND ", and ``value`` is
    ``tree_.value[node][0][0]`` rounded to three decimals.
    """
    structure = tree.tree_
    leaf_rules = []

    # Explicit stack instead of recursion. The right child is pushed first so
    # the left subtree is always expanded before the right one.
    pending = [(0, [])]
    while pending:
        node_id, conditions = pending.pop()
        split_feature = structure.feature[node_id]

        if split_feature == _tree.TREE_UNDEFINED:
            # Leaf: no split feature recorded, so emit the accumulated path.
            prediction = structure.value[node_id][0][0]
            leaf_rules.append((" AND ".join(conditions), round(prediction, 3)))
            continue

        label = feature_names[split_feature]
        cutoff = structure.threshold[node_id]
        pending.append(
            (structure.children_right[node_id], conditions + [f"{label} > {cutoff:.1f}"])
        )
        pending.append(
            (structure.children_left[node_id], conditions + [f"{label} <= {cutoff:.1f}"])
        )

    return leaf_rules

# Extract the leaf rules of the overall PPSF tree and print one line per leaf.
rules = extract_leaf_rules(tree_ppsf, X_ppsf.columns.tolist())
for rule, value in rules:
    print(f"If {rule} → PPSF ≈ {value}")

If Complex_The Landon <= 0.5 AND Complex_Bond on Mint <= 0.5 AND Sqft <= 1001.0 → PPSF ≈ 2.431
If Complex_The Landon <= 0.5 AND Complex_Bond on Mint <= 0.5 AND Sqft > 1001.0 → PPSF ≈ 2.024
If Complex_The Landon <= 0.5 AND Complex_Bond on Mint > 0.5 AND Sqft <= 751.5 → PPSF ≈ 3.227
If Complex_The Landon <= 0.5 AND Complex_Bond on Mint > 0.5 AND Sqft > 751.5 → PPSF ≈ 2.876
If Complex_The Landon > 0.5 AND Sqft <= 1071.5 AND Sqft <= 877.0 → PPSF ≈ 1.581
If Complex_The Landon > 0.5 AND Sqft <= 1071.5 AND Sqft > 877.0 → PPSF ≈ 1.375
If Complex_The Landon > 0.5 AND Sqft > 1071.5 AND Bedrooms <= 2.5 → PPSF ≈ 1.12
If Complex_The Landon > 0.5 AND Sqft > 1071.5 AND Bedrooms > 2.5 → PPSF ≈ 1.264

# Drop non-predictive or text-heavy columns
apt_cleaned = apt.drop(columns=['Address', 'Unit_Variant', 'Amenities', 'Website'])

# Encode categorical columns (every category keeps its own indicator column)
apt_encoded = pd.get_dummies(apt_cleaned, columns=['Complex', 'Neighborhood'], drop_first=False)

# Filter for small units (≤ 1010 sqft)
apt_small = apt_encoded[apt_encoded['Sqft'] <= 1010]

# Define target and features for PPSF (small units)
# NOTE(review): 'Rent' stays in the feature matrix while the target is
# price_per_sqft. If price_per_sqft is derived from Rent and Sqft this is
# target leakage — confirm, and consider dropping 'Rent' from the features.
y_small = apt_small['price_per_sqft']
X_small = apt_small.drop(columns=['price_per_sqft'])

# Train/test split (distinct variable names for small units)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_small, y_small, test_size=0.2, random_state=42
)

# Fit decision tree for PPSF (small units)
tree_small = DecisionTreeRegressor(max_depth=3, min_samples_leaf=5, random_state=42)
tree_small.fit(X_train_s, y_train_s)

# Visualize the tree
plt.figure(figsize=(36, 20))
plot_tree(
    tree_small,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here,
    # and this matches how export_text is called elsewhere in this notebook.
    feature_names=list(X_small.columns),
    filled=True,
    rounded=True,
    fontsize=14
)
plt.title("PPSF Decision Tree — Small Units (≤ 1010 sqft)", fontsize=18)
plt.show()

# Feature importance
importances_small = pd.Series(tree_small.feature_importances_, index=X_small.columns)
print(importances_small.sort_values(ascending=False))

# Calculate R² score for PPSF tree (small units) on the held-out 20% split
r2_tree_small = tree_small.score(X_test_s, y_test_s)

# R² score for PPSF tree (small units)
print(f"\n📈 R² for PPSF Tree — Small Units: {r2_tree_small:.3f}")

Complex_The Landon              0.484874
Complex_Bond on Mint            0.322976
Complex_Solis Midtown           0.185790
Sqft                            0.006360
Rent                            0.000000
Bathrooms                       0.000000
Bedrooms                        0.000000
laundry                         0.000000
parking                         0.000000
ev_charging                     0.000000
gym                             0.000000
pool                            0.000000
secure_access                   0.000000
wifi                            0.000000
trash_pickup                    0.000000
wifi_common                     0.000000
renters_insurance               0.000000
packages                        0.000000
elevator                        0.000000
pets                            0.000000
Complex_Broadstone Craft        0.000000
recycling                       0.000000
Complex_Ello House              0.000000
Complex_Hawkins Press           0.000000
Complex_Novel Mallard Creek     0.000000
Complex_Moderna Liberty Row     0.000000
Complex_The Henry               0.000000
Complex_The Leo LoSo            0.000000
Complex_The Perch               0.000000
Complex_Tyvola Tapestry         0.000000
Neighborhood_NoDa               0.000000
Neighborhood_South End          0.000000
Neighborhood_SouthPark          0.000000
Neighborhood_University City    0.000000
Neighborhood_Uptown             0.000000
Neighborhood_West Charlotte     0.000000
dtype: float64

📈 R² for PPSF Tree — Small Units: 0.507

def profile_tree_segments(tree, X, y, original_df, target_col, unit_col="Sqft", complex_col="Complex", neighborhood_col="Neighborhood"):
    """
    Build a mini segment profile for each leaf node of a fitted decision tree.

    Parameters:
    - tree: fitted estimator exposing ``apply(X)``, which returns one leaf id per row of X
    - X: feature DataFrame; its index labels are used to look rows up in original_df
    - y: target values (accepted for call compatibility; not read by this function)
    - original_df: DataFrame containing target_col, unit_col, complex_col and
      neighborhood_col, indexed with the same labels as X
    - target_col: column whose per-leaf mean is reported as "Avg_<target_col>"
    - unit_col: column name for unit size (default "Sqft")
    - complex_col: column name for complex (default "Complex")
    - neighborhood_col: column name for neighborhood (default "Neighborhood")

    Returns:
    - DataFrame with one row per leaf reached by X: Leaf_ID, Avg_<target_col>
      (rounded to 2 decimals), Typical_Size (mean unit size rounded to 0 decimals),
      Dominant_Complex, Dominant_Neighborhood and Count. A dominant value is None
      when every value of that column in the leaf is missing.
    """

    def _most_common(values):
        # Series.mode() drops missing values, so an all-missing column gives an
        # empty result; indexing it with [0] would raise instead of yielding None.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    # Get leaf node assignment for each row in X
    leaf_ids = np.asarray(tree.apply(X))

    profiles = []
    for leaf in np.unique(leaf_ids):
        # Use the same index as X to select this leaf's rows from original_df
        segment_df = original_df.loc[X.index[leaf_ids == leaf]]

        profiles.append({
            "Leaf_ID": leaf,
            "Avg_" + target_col: round(segment_df[target_col].mean(), 2),
            "Typical_Size": round(segment_df[unit_col].mean(), 0),
            "Dominant_Complex": _most_common(segment_df[complex_col]),
            "Dominant_Neighborhood": _most_common(segment_df[neighborhood_col]),
            "Count": len(segment_df)
        })

    return pd.DataFrame(profiles)

# Segment profiles for the small-unit PPSF tree.
# This cell previously re-profiled the overall tree (tree_ppsf / X_train_p), so
# the small-unit section never showed its own segments. segment_profiles_ppsf
# from the overall section is left untouched.
segment_profiles_small = profile_tree_segments(
    tree_small,
    X_train_s,   # features used to fit tree_small
    y_train_s,   # target
    apt_cleaned, # original df with Sqft, Complex, Neighborhood
    target_col="price_per_sqft"
)

print(segment_profiles_small)

   Leaf_ID  Avg_price_per_sqft  Typical_Size Dominant_Complex  \
0        3                2.43         721.0       Ello House   
1        4                2.02        1205.0     The Leo LoSo   
2        6                3.23         631.0     Bond on Mint   
3        7                2.88        1134.0     Bond on Mint   
4       10                1.58         769.0       The Landon   
5       11                1.38         980.0       The Landon   
6       13                1.12        1215.0       The Landon   
7       14                1.26        1338.0       The Landon   

  Dominant_Neighborhood  Count  
0             South End    103  
1             South End     54  
2                Uptown     10  
3                Uptown      7  
4             SouthPark      5  
5             SouthPark      6  
6             SouthPark      6  
7             SouthPark      5

# Text rules for tree_small, labelled with X_small's columns, three decimals.
small_feature_labels = X_small.columns.tolist()
tree_rules = export_text(tree_small, decimals=3, feature_names=small_feature_labels)
print(tree_rules)

|--- Complex_The Landon <= 0.500
|   |--- Complex_Bond on Mint <= 0.500
|   |   |--- Complex_Solis Midtown <= 0.500
|   |   |   |--- value: [2.334]
|   |   |--- Complex_Solis Midtown >  0.500
|   |   |   |--- value: [2.899]
|   |--- Complex_Bond on Mint >  0.500
|   |   |--- Sqft <= 713.000
|   |   |   |--- value: [3.272]
|   |   |--- Sqft >  713.000
|   |   |   |--- value: [3.065]
|--- Complex_The Landon >  0.500
|   |--- value: [1.471]

def extract_leaf_rules(tree, feature_names):
    """Collect one human-readable rule per leaf of a fitted decision tree.

    Walks ``tree.tree_`` from the root. Each split contributes
    ``"<feature> <= <threshold>"`` on its left branch and
    ``"<feature> > <threshold>"`` on its right branch, with the threshold
    formatted to one decimal place.

    Returns a list of ``(rule, value)`` tuples in left-to-right leaf order:
    ``rule`` joins the conditions on the path with " AND ", and ``value`` is
    ``tree_.value[node][0][0]`` rounded to three decimals.
    """
    structure = tree.tree_
    leaf_rules = []

    # Explicit stack instead of recursion. The right child is pushed first so
    # the left subtree is always expanded before the right one.
    pending = [(0, [])]
    while pending:
        node_id, conditions = pending.pop()
        split_feature = structure.feature[node_id]

        if split_feature == _tree.TREE_UNDEFINED:
            # Leaf: no split feature recorded, so emit the accumulated path.
            prediction = structure.value[node_id][0][0]
            leaf_rules.append((" AND ".join(conditions), round(prediction, 3)))
            continue

        label = feature_names[split_feature]
        cutoff = structure.threshold[node_id]
        pending.append(
            (structure.children_right[node_id], conditions + [f"{label} > {cutoff:.1f}"])
        )
        pending.append(
            (structure.children_left[node_id], conditions + [f"{label} <= {cutoff:.1f}"])
        )

    return leaf_rules

# Extract the leaf rules of the small-unit tree and print one line per leaf.
rules = extract_leaf_rules(tree_small, X_small.columns.tolist())
for rule, value in rules:
    print(f"If {rule} → PPSF ≈ {value}")

If Complex_The Landon <= 0.5 AND Complex_Bond on Mint <= 0.5 AND Complex_Solis Midtown <= 0.5 → PPSF ≈ 2.334
If Complex_The Landon <= 0.5 AND Complex_Bond on Mint <= 0.5 AND Complex_Solis Midtown > 0.5 → PPSF ≈ 2.899
If Complex_The Landon <= 0.5 AND Complex_Bond on Mint > 0.5 AND Sqft <= 713.0 → PPSF ≈ 3.272
If Complex_The Landon <= 0.5 AND Complex_Bond on Mint > 0.5 AND Sqft > 713.0 → PPSF ≈ 3.065
If Complex_The Landon > 0.5 → PPSF ≈ 1.471

# Drop non-predictive or text-heavy columns
apt_cleaned = apt.drop(columns=['Address', 'Unit_Variant', 'Amenities', 'Website'])

# Encode categorical columns (every category keeps its own indicator column)
apt_encoded = pd.get_dummies(apt_cleaned, columns=['Complex', 'Neighborhood'], drop_first=False)

# Filter for large units (> 1010 sqft)
apt_large = apt_encoded[apt_encoded['Sqft'] > 1010]

# Define target and features for PPSF (large units)
# NOTE(review): 'Rent' stays in the feature matrix while the target is
# price_per_sqft, and the recorded importances for this tree are dominated by
# Rent (~0.93). If price_per_sqft is derived from Rent and Sqft this is target
# leakage and the R² is inflated — confirm, and consider dropping 'Rent'.
y_large = apt_large['price_per_sqft']
X_large = apt_large.drop(columns=['price_per_sqft'])

# Train/test split (distinct variable names for large units)
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X_large, y_large, test_size=0.2, random_state=42
)

# Fit decision tree for PPSF (large units); one level deeper than the other trees
tree_large = DecisionTreeRegressor(max_depth=4, min_samples_leaf=5, random_state=42)
tree_large.fit(X_train_l, y_train_l)

# Visualize the tree
plt.figure(figsize=(30, 16))
plot_tree(
    tree_large,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here,
    # and this matches how export_text is called elsewhere in this notebook.
    feature_names=list(X_large.columns),
    filled=True,
    rounded=True,
    fontsize=12
)
plt.title("PPSF Decision Tree — Large Units (> 1010 sqft)", fontsize=18)
plt.show()

# Feature importance
importances_large = pd.Series(tree_large.feature_importances_, index=X_large.columns)
print("\n🔍 Feature Importances — Large Units (> 1010 sqft):\n")
print(importances_large.sort_values(ascending=False))

# R² score for PPSF tree (large units) on the held-out 20% split
r2_tree_large = tree_large.score(X_test_l, y_test_l)
print(f"\n📈 R² for PPSF Tree — Large Units: {r2_tree_large:.3f}")

🔍 Feature Importances — Large Units (> 1010 sqft):

Rent                            0.927886
Sqft                            0.047403
Complex_The Leo LoSo            0.024711
Bathrooms                       0.000000
Bedrooms                        0.000000
laundry                         0.000000
gym                             0.000000
pool                            0.000000
parking                         0.000000
ev_charging                     0.000000
elevator                        0.000000
pets                            0.000000
wifi                            0.000000
wifi_common                     0.000000
trash_pickup                    0.000000
renters_insurance               0.000000
packages                        0.000000
recycling                       0.000000
Complex_Bond on Mint            0.000000
secure_access                   0.000000
Complex_Broadstone Craft        0.000000
Complex_Ello House              0.000000
Complex_Moderna Liberty Row     0.000000
Complex_Hawkins Press           0.000000
Complex_Novel Mallard Creek     0.000000
Complex_Solis Midtown           0.000000
Complex_The Henry               0.000000
Complex_The Landon              0.000000
Complex_The Perch               0.000000
Complex_Tyvola Tapestry         0.000000
Neighborhood_NoDa               0.000000
Neighborhood_South End          0.000000
Neighborhood_SouthPark          0.000000
Neighborhood_University City    0.000000
Neighborhood_Uptown             0.000000
Neighborhood_West Charlotte     0.000000
dtype: float64

📈 R² for PPSF Tree — Large Units: 0.916

def profile_tree_segments(tree, X, y, original_df, target_col, unit_col="Sqft", complex_col="Complex", neighborhood_col="Neighborhood"):
    """
    Build a mini segment profile for each leaf node of a fitted decision tree.

    Parameters:
    - tree: fitted estimator exposing ``apply(X)``, which returns one leaf id per row of X
    - X: feature DataFrame; its index labels are used to look rows up in original_df
    - y: target values (accepted for call compatibility; not read by this function)
    - original_df: DataFrame containing target_col, unit_col, complex_col and
      neighborhood_col, indexed with the same labels as X
    - target_col: column whose per-leaf mean is reported as "Avg_<target_col>"
    - unit_col: column name for unit size (default "Sqft")
    - complex_col: column name for complex (default "Complex")
    - neighborhood_col: column name for neighborhood (default "Neighborhood")

    Returns:
    - DataFrame with one row per leaf reached by X: Leaf_ID, Avg_<target_col>
      (rounded to 2 decimals), Typical_Size (mean unit size rounded to 0 decimals),
      Dominant_Complex, Dominant_Neighborhood and Count. A dominant value is None
      when every value of that column in the leaf is missing.
    """

    def _most_common(values):
        # Series.mode() drops missing values, so an all-missing column gives an
        # empty result; indexing it with [0] would raise instead of yielding None.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    # Get leaf node assignment for each row in X
    leaf_ids = np.asarray(tree.apply(X))

    profiles = []
    for leaf in np.unique(leaf_ids):
        # Use the same index as X to select this leaf's rows from original_df
        segment_df = original_df.loc[X.index[leaf_ids == leaf]]

        profiles.append({
            "Leaf_ID": leaf,
            "Avg_" + target_col: round(segment_df[target_col].mean(), 2),
            "Typical_Size": round(segment_df[unit_col].mean(), 0),
            "Dominant_Complex": _most_common(segment_df[complex_col]),
            "Dominant_Neighborhood": _most_common(segment_df[neighborhood_col]),
            "Count": len(segment_df)
        })

    return pd.DataFrame(profiles)

# Segment profiles for the large-unit PPSF tree.
# This cell previously re-profiled the overall tree (tree_ppsf / X_train_p), so
# the large-unit section never showed its own segments. segment_profiles_ppsf
# from the overall section is left untouched.
segment_profiles_large = profile_tree_segments(
    tree_large,
    X_train_l,   # features used to fit tree_large
    y_train_l,   # target
    apt_cleaned, # original df with Sqft, Complex, Neighborhood
    target_col="price_per_sqft"
)

print(segment_profiles_large)

   Leaf_ID  Avg_price_per_sqft  Typical_Size Dominant_Complex  \
0        3                2.43         721.0       Ello House   
1        4                2.02        1205.0     The Leo LoSo   
2        6                3.23         631.0     Bond on Mint   
3        7                2.88        1134.0     Bond on Mint   
4       10                1.58         769.0       The Landon   
5       11                1.38         980.0       The Landon   
6       13                1.12        1215.0       The Landon   
7       14                1.26        1338.0       The Landon   

  Dominant_Neighborhood  Count  
0             South End    103  
1             South End     54  
2                Uptown     10  
3                Uptown      7  
4             SouthPark      5  
5             SouthPark      6  
6             SouthPark      6  
7             SouthPark      5

# Text rules for tree_large. Feature labels come from X_large (the frame this
# tree's training split was drawn from) rather than X_small.
tree_rules = export_text(tree_large, feature_names=list(X_large.columns), decimals=3)
print(tree_rules)

|--- Rent <= 2573.500
|   |--- Rent <= 1792.500
|   |   |--- Rent <= 1537.500
|   |   |   |--- value: [1.133]
|   |   |--- Rent >  1537.500
|   |   |   |--- value: [1.426]
|   |--- Rent >  1792.500
|   |   |--- Complex_The Leo LoSo <= 0.500
|   |   |   |--- Sqft <= 1208.500
|   |   |   |   |--- value: [2.037]
|   |   |   |--- Sqft >  1208.500
|   |   |   |   |--- value: [1.820]
|   |   |--- Complex_The Leo LoSo >  0.500
|   |   |   |--- value: [1.663]
|--- Rent >  2573.500
|   |--- Rent <= 3252.000
|   |   |--- Sqft <= 1117.500
|   |   |   |--- value: [2.632]
|   |   |--- Sqft >  1117.500
|   |   |   |--- value: [2.233]
|   |--- Rent >  3252.000
|   |   |--- Sqft <= 1250.500
|   |   |   |--- value: [2.988]
|   |   |--- Sqft >  1250.500
|   |   |   |--- value: [2.716]

def extract_leaf_rules(tree, feature_names):
    """Collect one human-readable rule per leaf of a fitted decision tree.

    Walks ``tree.tree_`` from the root. Each split contributes
    ``"<feature> <= <threshold>"`` on its left branch and
    ``"<feature> > <threshold>"`` on its right branch, with the threshold
    formatted to one decimal place.

    Returns a list of ``(rule, value)`` tuples in left-to-right leaf order:
    ``rule`` joins the conditions on the path with " AND ", and ``value`` is
    ``tree_.value[node][0][0]`` rounded to three decimals.
    """
    structure = tree.tree_
    leaf_rules = []

    # Explicit stack instead of recursion. The right child is pushed first so
    # the left subtree is always expanded before the right one.
    pending = [(0, [])]
    while pending:
        node_id, conditions = pending.pop()
        split_feature = structure.feature[node_id]

        if split_feature == _tree.TREE_UNDEFINED:
            # Leaf: no split feature recorded, so emit the accumulated path.
            prediction = structure.value[node_id][0][0]
            leaf_rules.append((" AND ".join(conditions), round(prediction, 3)))
            continue

        label = feature_names[split_feature]
        cutoff = structure.threshold[node_id]
        pending.append(
            (structure.children_right[node_id], conditions + [f"{label} > {cutoff:.1f}"])
        )
        pending.append(
            (structure.children_left[node_id], conditions + [f"{label} <= {cutoff:.1f}"])
        )

    return leaf_rules

# Extract and print the leaf rules of the large-unit tree.
# Feature labels come from X_large, the frame tree_large was trained from,
# instead of the small-unit frame's columns.
rules = extract_leaf_rules(tree_large, list(X_large.columns))
for rule, value in rules:
    print(f"If {rule} → PPSF ≈ {value}")

If Rent <= 2573.5 AND Rent <= 1792.5 AND Rent <= 1537.5 → PPSF ≈ 1.133
If Rent <= 2573.5 AND Rent <= 1792.5 AND Rent > 1537.5 → PPSF ≈ 1.426
If Rent <= 2573.5 AND Rent > 1792.5 AND Complex_The Leo LoSo <= 0.5 AND Sqft <= 1208.5 → PPSF ≈ 2.037
If Rent <= 2573.5 AND Rent > 1792.5 AND Complex_The Leo LoSo <= 0.5 AND Sqft > 1208.5 → PPSF ≈ 1.82
If Rent <= 2573.5 AND Rent > 1792.5 AND Complex_The Leo LoSo > 0.5 → PPSF ≈ 1.663
If Rent > 2573.5 AND Rent <= 3252.0 AND Sqft <= 1117.5 → PPSF ≈ 2.632
If Rent > 2573.5 AND Rent <= 3252.0 AND Sqft > 1117.5 → PPSF ≈ 2.233
If Rent > 2573.5 AND Rent > 3252.0 AND Sqft <= 1250.5 → PPSF ≈ 2.988
If Rent > 2573.5 AND Rent > 3252.0 AND Sqft > 1250.5 → PPSF ≈ 2.716

# Drop non-predictive or text-heavy columns
apt_cleaned = apt.drop(columns=['Address', 'Unit_Variant', 'Amenities', 'Website'])

# Encode categorical columns (every category keeps its own indicator column)
apt_encoded = pd.get_dummies(apt_cleaned, columns=['Complex', 'Neighborhood'], drop_first=False)

# Define target and features for Rent
# NOTE(review): 'price_per_sqft' stays in the feature matrix while the target is
# Rent, and the recorded importances show the tree splitting only on
# price_per_sqft and Sqft. If price_per_sqft is derived from Rent and Sqft this
# is target leakage — confirm, and consider dropping 'price_per_sqft'.
y_rent = apt_encoded['Rent']
X_rent = apt_encoded.drop(columns=['Rent'])

# Train/test split (distinct variable names for Rent)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_rent, y_rent, test_size=0.2, random_state=42
)

# Fit decision tree for Rent
tree_rent = DecisionTreeRegressor(max_depth=3, min_samples_leaf=5, random_state=42)
tree_rent.fit(X_train_r, y_train_r)

# Visualize the tree
plt.figure(figsize=(36, 20))
plot_tree(
    tree_rent,
    # Pass a plain list: some scikit-learn releases reject a pandas Index here,
    # and this matches how export_text is called elsewhere in this notebook.
    feature_names=list(X_rent.columns),
    filled=True,
    rounded=True,
    fontsize=14
)
plt.title("Rent Decision Tree", fontsize=18)
plt.show()

# Feature importance
importances_rent = pd.Series(tree_rent.feature_importances_, index=X_rent.columns)
print(importances_rent.sort_values(ascending=False))

# R² score for Rent tree (evaluated on the held-out 20% split)
r2_tree_rent = tree_rent.score(X_test_r, y_test_r)
print(f"R² for Rent Decision Tree: {r2_tree_rent:.3f}")

price_per_sqft                  0.54098
Sqft                            0.45902
Bathrooms                       0.00000
Bedrooms                        0.00000
laundry                         0.00000
pool                            0.00000
gym                             0.00000
pets                            0.00000
parking                         0.00000
ev_charging                     0.00000
elevator                        0.00000
secure_access                   0.00000
wifi                            0.00000
wifi_common                     0.00000
trash_pickup                    0.00000
renters_insurance               0.00000
packages                        0.00000
recycling                       0.00000
Complex_Bond on Mint            0.00000
Complex_Broadstone Craft        0.00000
Complex_Ello House              0.00000
Complex_Hawkins Press           0.00000
Complex_Moderna Liberty Row     0.00000
Complex_Novel Mallard Creek     0.00000
Complex_Solis Midtown           0.00000
Complex_The Henry               0.00000
Complex_The Landon              0.00000
Complex_The Leo LoSo            0.00000
Complex_The Perch               0.00000
Complex_Tyvola Tapestry         0.00000
Neighborhood_NoDa               0.00000
Neighborhood_South End          0.00000
Neighborhood_SouthPark          0.00000
Neighborhood_University City    0.00000
Neighborhood_Uptown             0.00000
Neighborhood_West Charlotte     0.00000
dtype: float64
R² for Rent Decision Tree: 0.747

def _dominant_value(values):
    """Return the most frequent non-missing value of a Series, or None if there is none."""
    modes = values.mode()
    # mode() drops missing values, so an all-missing column yields an empty result.
    return modes.iloc[0] if not modes.empty else None


def profile_tree_segments_rent(tree, X, y, original_df, unit_col="Sqft", complex_col="Complex", neighborhood_col="Neighborhood", rent_col="Rent"):
    """
    Build a mini segment profile for each leaf node of a fitted Rent decision tree.

    Every row of X is routed to its leaf with ``tree.apply``; the matching rows
    of original_df (looked up by X's index labels) are then summarised per leaf.

    Parameters:
    - tree: fitted DecisionTreeRegressor (any object whose ``apply(X)`` returns
      one leaf id per row of X works)
    - X: features used to fit the tree (DataFrame); its index labels must exist
      in original_df
    - y: target values (Series); not used by the computation, kept so existing
      callers do not break
    - original_df: the original DataFrame with unit size, complex, neighborhood, and rent
    - unit_col: column name for unit size (default "Sqft")
    - complex_col: column name for complex (default "Complex")
    - neighborhood_col: column name for neighborhood (default "Neighborhood")
    - rent_col: column name for rent (default "Rent")

    Returns:
    - DataFrame with one row per leaf (ascending leaf id) and the columns
      Leaf_ID, Avg_Rent, Typical_Size, Dominant_Complex, Dominant_Neighborhood, Count.
      A dominant value is None when the leaf has no non-missing value for that
      column; ties are resolved by taking the first mode returned by pandas.
    """
    columns = [
        "Leaf_ID",
        "Avg_Rent",
        "Typical_Size",
        "Dominant_Complex",
        "Dominant_Neighborhood",
        "Count",
    ]

    # Leaf node assignment for each row in X
    leaf_ids = np.asarray(tree.apply(X))

    profiles = []
    for leaf in np.unique(leaf_ids):
        # Align the leaf's rows with original_df using X's index labels
        segment_df = original_df.loc[X.index[leaf_ids == leaf]]

        profiles.append({
            "Leaf_ID": leaf,
            "Avg_Rent": round(segment_df[rent_col].mean(), 2),
            "Typical_Size": round(segment_df[unit_col].mean(), 0),
            "Dominant_Complex": _dominant_value(segment_df[complex_col]),
            "Dominant_Neighborhood": _dominant_value(segment_df[neighborhood_col]),
            "Count": len(segment_df),
        })

    # Passing columns keeps the schema stable even if no leaf was produced.
    return pd.DataFrame(profiles, columns=columns)

# Profile every leaf of the Rent tree from the rows it was fit on.
# apt_cleaned is the un-encoded frame that still carries the Sqft, Complex,
# Neighborhood and Rent columns the profiler summarises.
segment_profiles_rent = profile_tree_segments_rent(
    tree=tree_rent,
    X=X_train_r,
    y=y_train_r,
    original_df=apt_cleaned,
)

print(segment_profiles_rent)

   Leaf_ID  Avg_Rent  Typical_Size  Dominant_Complex Dominant_Neighborhood  \
0        3   1236.48         852.0        The Landon             SouthPark   
1        4   1618.76         730.0  Broadstone Craft             South End   
2        6   1691.14         567.0      Bond on Mint                Uptown   
3        7   2230.15         762.0     Hawkins Press             South End   
4       10   1618.04        1205.0        The Landon             SouthPark   
5       11   2262.16        1211.0         The Perch                Uptown   
6       13   2847.53        1120.0        Ello House             South End   
7       14   3792.62        1426.0     Solis Midtown        West Charlotte   

   Count  
0      9  
1     72  
2     20  
3     20  
4     24  
5     22  
6     21  
7      8

# Export the fitted Rent tree as indented text rules, naming each split after
# its X_rent column and printing thresholds/values with three decimals.
tree_rules_rent = export_text(tree_rent, feature_names=[*X_rent.columns], decimals=3)

print(tree_rules_rent)

|--- Sqft <= 995.000
|   |--- price_per_sqft <= 2.595
|   |   |--- price_per_sqft <= 1.714
|   |   |   |--- value: [1236.481]
|   |   |--- price_per_sqft >  1.714
|   |   |   |--- value: [1618.762]
|   |--- price_per_sqft >  2.595
|   |   |--- Sqft <= 653.500
|   |   |   |--- value: [1691.142]
|   |   |--- Sqft >  653.500
|   |   |   |--- value: [2230.150]
|--- Sqft >  995.000
|   |--- price_per_sqft <= 2.105
|   |   |--- price_per_sqft <= 1.662
|   |   |   |--- value: [1618.042]
|   |   |--- price_per_sqft >  1.662
|   |   |   |--- value: [2262.159]
|   |--- price_per_sqft >  2.105
|   |   |--- Sqft <= 1320.000
|   |   |   |--- value: [2847.530]
|   |   |--- Sqft >  1320.000
|   |   |   |--- value: [3792.625]

# Prepare a fully numeric design matrix (X_clean) and target (y_clean) for a
# statsmodels regression on the apartment data.

# Step 1: Drop non-useful columns
# errors="ignore" means a column that is already absent is skipped instead of raising.
apt_cleaned = apt.drop(columns=["Address", "Unit_Variant", "Amenities", "Website"], errors="ignore")

# Step 2: One-hot encode categorical variables
# drop_first=False keeps one indicator column for every Complex and Neighborhood level.
# NOTE(review): each full indicator set sums to 1, which is collinear with the
# intercept added in Step 6 -- confirm the downstream models tolerate this or
# consider drop_first=True.
apt_encoded = pd.get_dummies(apt_cleaned, columns=["Complex", "Neighborhood"], drop_first=False)

# Step 3: Define target and features
# NOTE(review): only the target column is removed, so Rent and Sqft stay in X.
# If price_per_sqft is derived from Rent / Sqft, the predictors determine the
# target -- confirm this is intended.
target = "price_per_sqft"   # or "Rent" if you want rent as target
X = apt_encoded.drop(columns=[target])
y = apt_encoded[target]

# Step 4: Force numeric conversion and drop rows with non-numeric or missing values
# errors='coerce' turns unparseable entries into NaN; dropna() then removes any
# row that has a NaN in at least one column (features or target).
df_model = pd.concat([X, y], axis=1)
df_model = df_model.apply(pd.to_numeric, errors='coerce').dropna()

# Step 5: Separate cleaned features and target, force float64 dtype
X_clean = df_model.drop(columns=[target]).astype(np.float64)
y_clean = df_model[target].astype(np.float64)

# Step 6: Add constant for regression intercept
X_clean = sm.add_constant(X_clean)

# === R² of each fitted tree on its own test split ===
r2_tree_ppsf = tree_ppsf.score(X_test_p, y_test_p)      # PPSF, all units
r2_tree_small = tree_small.score(X_test_s, y_test_s)    # PPSF, small units
r2_tree_large = tree_large.score(X_test_l, y_test_l)    # PPSF, large units
r2_tree_rent = tree_rent.score(X_test_r, y_test_r)      # Rent, all units

# === One row per model: (label, R²) ===
tree_r2_summary = pd.DataFrame(
    [
        ("PPSF Overall", r2_tree_ppsf),
        ("PPSF ≤ 1010 sqft", r2_tree_small),
        ("PPSF > 1010 sqft", r2_tree_large),
        ("Rent Overall", r2_tree_rent),
    ],
    columns=["Model", "R_squared"],
)

# Unweighted mean of the four R² values, expressed as a percentage.
resume_percent_tree = round(tree_r2_summary["R_squared"].mean() * 100, 2)

print(tree_r2_summary)
print(f"\n🧠 Resume % (Avg R² across trees): {resume_percent_tree}%")

              Model  R_squared
0      PPSF Overall   0.605674
1  PPSF ≤ 1010 sqft   0.507014
2  PPSF > 1010 sqft   0.916093
3      Rent Overall   0.747423

🧠 Resume % (Avg R² across trees): 69.41%

# R² of each regression model, keyed by model label (values entered by hand).
data = {
    "Model_3": 0.999747,
    "Model_6": 0.999401,
    "Model_20": 0.997807,
    "Model_10": 0.997807,
    "Model_5": 0.991060,
    "Model_24": 0.984648,
    "Model_16": 0.984114,
    "Model_4": 0.983144,
    "Model_22": 0.982825,
    "Model_23": 0.979736,
    "Model_9": 0.977991,
    "Model_18": 0.977991,
    "Model_17": 0.977600,
    "Model_12": 0.971687,
    "Model_19": 0.971687,
    "Model_21": 0.968111,
    "Model_14": 0.961383,
    "Model_7": 0.959890,
    "Model_8": 0.958345,
    "Model_13": 0.956600,
    "Model_1": 0.956078,
    "Model_2": 0.955864,
    "Model_15": 0.954173,
    "Model_11": 0.936356,
}

# Two-column table: the dict keys become "Model", the values "R_squared".
df = pd.Series(data, name="R_squared").rename_axis("Model").reset_index()

# Keep only the numeric part of each label so the rows can be ordered 1..24.
df["Model"] = df["Model"].str.replace("Model_", "").astype(int)
df = df.sort_values("Model").reset_index(drop=True)

# Mean R² over the models, taken before the summary row is appended so the
# summary row does not feed back into its own value.
average_r2 = df["R_squared"].mean()
df.loc[len(df)] = ["Average", average_r2]

print(df)

# Same mean expressed as a percentage with two decimals.
brain_percent = round(average_r2 * 100, 2)
print(f"\n🧠 Resume % (avg R² across regressions): {brain_percent}%")

      Model  R_squared
0         1   0.956078
1         2   0.955864
2         3   0.999747
3         4   0.983144
4         5   0.991060
5         6   0.999401
6         7   0.959890
7         8   0.958345
8         9   0.977991
9        10   0.997807
10       11   0.936356
11       12   0.971687
12       13   0.956600
13       14   0.961383
14       15   0.954173
15       16   0.984114
16       17   0.977600
17       18   0.977991
18       19   0.971687
19       20   0.997807
20       21   0.968111
21       22   0.982825
22       23   0.979736
23       24   0.984648
24  Average   0.974335

🧠 Resume % (avg R² across regressions): 97.43%

# Illustration only: fit a depth-3 tree and a straight line to the same
# synthetic Sqft/Rent points and plot the two fits side by side.

# --- Synthetic dataset (Sqft vs Rent) ---
# Seed the global NumPy RNG so the noise term below is reproducible.
np.random.seed(42)
# 100 evenly spaced sizes from 500 to 1500, shaped as a single-feature column.
sqft = np.linspace(500, 1500, 100).reshape(-1, 1)
# True rent relationship: nonlinear with some noise
# (linear trend + sine wiggle + Gaussian noise with standard deviation 100)
rent = 1.5 * sqft.flatten() + 500 + 200 * np.sin(sqft.flatten()/200) + np.random.normal(0, 100, 100)

# --- Decision Tree fit ---
# Fit and predict on the same points, so the curve shown is the in-sample fit.
tree = DecisionTreeRegressor(max_depth=3)
tree.fit(sqft, rent)
rent_tree_pred = tree.predict(sqft)

# --- Regression fit ---
# Also fit and predicted on the same points.
reg = LinearRegression()
reg.fit(sqft, rent)
rent_reg_pred = reg.predict(sqft)

# --- Plot side-by-side ---
# sharey=True puts both panels on one rent scale for direct comparison.
fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharey=True)

# Decision Tree plot
# NOTE(review): the R² in this title (and in the regression title below) is a
# hard-coded literal, not computed from the synthetic fit drawn here; it lines
# up with the averaged R² values printed earlier in the notebook. Confirm the
# titles should quote those figures rather than tree.score / reg.score on this data.
axes[0].scatter(sqft, rent, color="gray", alpha=0.6, label="Observed")
axes[0].plot(sqft, rent_tree_pred, color="red", linewidth=2, label="Tree prediction")
axes[0].set_title("Decision Tree Segmentation\nStepwise thresholds (R² ≈ 0.69)")
axes[0].set_xlabel("Sqft")
axes[0].set_ylabel("Rent")
axes[0].legend()

# Regression plot
# No y-label here: the y-axis is shared with the left panel.
axes[1].scatter(sqft, rent, color="gray", alpha=0.6, label="Observed")
axes[1].plot(sqft, rent_reg_pred, color="blue", linewidth=2, label="Regression fit")
axes[1].set_title("Regression Fit\nSmooth continuous line (R² ≈ 0.97)")
axes[1].set_xlabel("Sqft")
axes[1].legend()

plt.tight_layout()
plt.show()

Key Takeaways from the Descriptive Phase¶

Key Takeaways from the Geospatial Phase¶

Key Takeaways from the Regression Phase¶

Introduction to Decision Tree Modeling¶

PPSF¶

PPSF Decision Tree Leaf Summary¶

PPSF Decision Tree Rule Paths¶

PPSF Decision Tree If-Then Rules¶

Conclusion for PPSF Decision Tree¶

PPSF Small¶

Small PPSF Decision Tree Leaf Summary¶

Small PPSF Decision Tree Rule Paths¶

Small PPSF Decision Tree If-Then Rules¶

Conclusion for Small PPSF Decision Tree¶

PPSF Large Decision Tree¶

Large PPSF Decision Tree Leaf Summary¶

Large PPSF Decision Tree Rule Paths¶

Large PPSF Decision Tree If-Then Rules¶

Conclusion for Large PPSF Decision Tree¶

Rent¶

Rent Decision Tree Leaf Summary¶

Rent Decision Tree Rule Paths¶

Rent Decision Tree Conclusion¶

Comparison of Decision Tree and Regression¶

Overall Conclusion¶

	Complex	Address	Unit_Variant	Bedrooms	Bathrooms	Rent	Sqft	Amenities	Website	Neighborhood	...	wifi_common
0	Moderna Liberty Row	7740 Liberty Row Dr, Charlotte, NC 28210	S01	0.0	1.0	1469.0	651.0	In-unit washer/dryer; High-speed internet in common areas; Controlled access bicycle storage; Additional storage available; Resort-style pool; 24-hour fitness center; Game room with billiards, pok...	https://www.moderalibertyrow.com/	SouthPark	...	1
1	Moderna Liberty Row	7740 Liberty Row Dr, Charlotte, NC 28210	A01	1.0	1.0	1707.0	747.0	In-unit washer/dryer; High-speed internet in common areas; Controlled access bicycle storage; Additional storage available; Resort-style pool; 24-hour fitness center; Game room with billiards, pok...	https://www.moderalibertyrow.com/	SouthPark	...	1
2	Moderna Liberty Row	7740 Liberty Row Dr, Charlotte, NC 28210	A02	1.0	1.0	1707.0	747.0	In-unit washer/dryer; High-speed internet in common areas; Controlled access bicycle storage; Additional storage available; Resort-style pool; 24-hour fitness center; Game room with billiards, pok...	https://www.moderalibertyrow.com/	SouthPark	...	1
3	Moderna Liberty Row	7740 Liberty Row Dr, Charlotte, NC 28210	A03	1.0	1.0	1532.0	801.0	In-unit washer/dryer; High-speed internet in common areas; Controlled access bicycle storage; Additional storage available; Resort-style pool; 24-hour fitness center; Game room with billiards, pok...	https://www.moderalibertyrow.com/	SouthPark	...	1
4	Moderna Liberty Row	7740 Liberty Row Dr, Charlotte, NC 28210	A04	1.0	1.0	1766.0	861.0	In-unit washer/dryer; High-speed internet in common areas; Controlled access bicycle storage; Additional storage available; Resort-style pool; 24-hour fitness center; Game room with billiards, pok...	https://www.moderalibertyrow.com/	SouthPark	...	1